Data Science Intern at CodeClause
CODECLAUSE PROJECT 1 : Churn Prediction in Telecom Industry Using Logistic Regression
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import plotly.express as px
#INPORTING DATASET
data1 = pd.read_csv("C:/Users/Nupur/Documents/A INTERNSHIP/Telco data.csv")
data1.head()
| customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | ... | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7590-VHVEG | Female | 0 | Yes | No | 1 | No | No phone service | DSL | No | ... | No | No | No | No | Month-to-month | Yes | Electronic check | 29.85 | 29.85 | No |
| 1 | 5575-GNVDE | Male | 0 | No | No | 34 | Yes | No | DSL | Yes | ... | Yes | No | No | No | One year | No | Mailed check | 56.95 | 1889.5 | No |
| 2 | 3668-QPYBK | Male | 0 | No | No | 2 | Yes | No | DSL | Yes | ... | No | No | No | No | Month-to-month | Yes | Mailed check | 53.85 | 108.15 | Yes |
| 3 | 7795-CFOCW | Male | 0 | No | No | 45 | No | No phone service | DSL | Yes | ... | Yes | Yes | No | No | One year | No | Bank transfer (automatic) | 42.30 | 1840.75 | No |
| 4 | 9237-HQITU | Female | 0 | No | No | 2 | Yes | No | Fiber optic | No | ... | No | No | No | No | Month-to-month | Yes | Electronic check | 70.70 | 151.65 | Yes |
5 rows × 21 columns
data1.shape
(7043, 21)
data1.describe()
| SeniorCitizen | tenure | MonthlyCharges | |
|---|---|---|---|
| count | 7043.000000 | 7043.000000 | 7043.000000 |
| mean | 0.162147 | 32.371149 | 64.761692 |
| std | 0.368612 | 24.559481 | 30.090047 |
| min | 0.000000 | 0.000000 | 18.250000 |
| 25% | 0.000000 | 9.000000 | 35.500000 |
| 50% | 0.000000 | 29.000000 | 70.350000 |
| 75% | 0.000000 | 55.000000 | 89.850000 |
| max | 1.000000 | 72.000000 | 118.750000 |
# Checking Null values
data1.notnull().sum()
customerID 7043 gender 7043 SeniorCitizen 7043 Partner 7043 Dependents 7043 tenure 7043 PhoneService 7043 MultipleLines 7043 InternetService 7043 OnlineSecurity 7043 OnlineBackup 7043 DeviceProtection 7043 TechSupport 7043 StreamingTV 7043 StreamingMovies 7043 Contract 7043 PaperlessBilling 7043 PaymentMethod 7043 MonthlyCharges 7043 TotalCharges 7043 Churn 7043 dtype: int64
#There is no missing value in our dataset
%matplotlib inline
data_hist = px.histogram(data1, x='gender',color='Churn',marginal='box', color_discrete_sequence = ['darksalmon','dimgrey'])
data_hist.update_layout(bargap=0.2)
plt.bar(data1['gender'],data1['Churn'])
<BarContainer object of 7043 artists>
data1.hist(bins = 30, figsize=(20,15))
array([[<AxesSubplot:title={'center':'SeniorCitizen'}>,
<AxesSubplot:title={'center':'tenure'}>],
[<AxesSubplot:title={'center':'MonthlyCharges'}>, <AxesSubplot:>]],
dtype=object)
sns.pairplot(data1)
<seaborn.axisgrid.PairGrid at 0x2222b4b08b0>
#Removing gender, customerID,tenture they are not usefull
col = ['gender','customerID','tenure']
data1 = data1.drop(col,axis = 1)
sns.pairplot(data1)
<seaborn.axisgrid.PairGrid at 0x2222b4fbe80>
data1['TotalCharges'].notnull().sum()
7043
data1['MonthlyCharges'].describe()
count 7043.000000 mean 64.761692 std 30.090047 min 18.250000 25% 35.500000 50% 70.350000 75% 89.850000 max 118.750000 Name: MonthlyCharges, dtype: float64
data1['TotalCharges'].describe()
#the datatype of the TotalCharges is Object so we will changec that
count 7043 unique 6531 top freq 11 Name: TotalCharges, dtype: object
#due to string(" ") at 488 position you can not change the TotalCharges into Int
#so we will be removing/replacing that string which is --> " "
data1['TotalCharges'] = data1['TotalCharges'].replace(" ",np.nan)
data1['TotalCharges'] = pd.to_numeric(data1['TotalCharges'], errors = 'coerce') # coerce will replace all the non-numeric values with NaN
#dropping all the rows in which there is a null value
data1 = data1.dropna(how = "any", axis = 0) #removing all the rows which have null value in it
data1['TotalCharges'].describe()
count 7032.000000 mean 2283.300441 std 2266.771362 min 18.800000 25% 401.450000 50% 1397.475000 75% 3794.737500 max 8684.800000 Name: TotalCharges, dtype: float64
data1.notnull().sum()
SeniorCitizen 7032 Partner 7032 Dependents 7032 PhoneService 7032 MultipleLines 7032 InternetService 7032 OnlineSecurity 7032 OnlineBackup 7032 DeviceProtection 7032 TechSupport 7032 StreamingTV 7032 StreamingMovies 7032 Contract 7032 PaperlessBilling 7032 PaymentMethod 7032 MonthlyCharges 7032 TotalCharges 7032 Churn 7032 dtype: int64
data1['Churn'].describe()
count 7032 unique 2 top No freq 5163 Name: Churn, dtype: object
for i, predictor in enumerate(data1.drop(columns=['Churn', 'TotalCharges', 'MonthlyCharges'])):
ax = sns.countplot(data = data1, x = predictor, hue='Churn')
if predictor == "PaymentMethod":
ax.set_xticklabels(ax.get_xticklabels(), fontsize=7)
plt.tight_layout()
plt.show()
else:
plt.tight_layout()
plt.show()
#converting Yes as 1 and No as 0
data1["Churn"] = data1["Churn"].replace(['Yes','No'],[1,0])
C:\Users\Nupur\AppData\Local\Temp\ipykernel_19184\151837023.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
data1
| SeniorCitizen | Partner | Dependents | PhoneService | MultipleLines | InternetService | OnlineSecurity | OnlineBackup | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | Yes | No | No | No phone service | DSL | No | Yes | No | No | No | No | Month-to-month | Yes | Electronic check | 29.85 | 29.85 | 0 |
| 1 | 0 | No | No | Yes | No | DSL | Yes | No | Yes | No | No | No | One year | No | Mailed check | 56.95 | 1889.50 | 0 |
| 2 | 0 | No | No | Yes | No | DSL | Yes | Yes | No | No | No | No | Month-to-month | Yes | Mailed check | 53.85 | 108.15 | 1 |
| 3 | 0 | No | No | No | No phone service | DSL | Yes | No | Yes | Yes | No | No | One year | No | Bank transfer (automatic) | 42.30 | 1840.75 | 0 |
| 4 | 0 | No | No | Yes | No | Fiber optic | No | No | No | No | No | No | Month-to-month | Yes | Electronic check | 70.70 | 151.65 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7038 | 0 | Yes | Yes | Yes | Yes | DSL | Yes | No | Yes | Yes | Yes | Yes | One year | Yes | Mailed check | 84.80 | 1990.50 | 0 |
| 7039 | 0 | Yes | Yes | Yes | Yes | Fiber optic | No | Yes | Yes | No | Yes | Yes | One year | Yes | Credit card (automatic) | 103.20 | 7362.90 | 0 |
| 7040 | 0 | Yes | Yes | No | No phone service | DSL | Yes | No | No | No | No | No | Month-to-month | Yes | Electronic check | 29.60 | 346.45 | 0 |
| 7041 | 1 | Yes | No | Yes | Yes | Fiber optic | No | No | No | No | No | No | Month-to-month | Yes | Mailed check | 74.40 | 306.60 | 1 |
| 7042 | 0 | No | No | Yes | No | Fiber optic | Yes | No | Yes | Yes | Yes | Yes | Two year | Yes | Bank transfer (automatic) | 105.65 | 6844.50 | 0 |
7032 rows × 18 columns
telco_data_dummies = pd.get_dummies(data1)
telco_data_dummies
| SeniorCitizen | MonthlyCharges | TotalCharges | Churn | Partner_No | Partner_Yes | Dependents_No | Dependents_Yes | PhoneService_No | PhoneService_Yes | ... | StreamingMovies_Yes | Contract_Month-to-month | Contract_One year | Contract_Two year | PaperlessBilling_No | PaperlessBilling_Yes | PaymentMethod_Bank transfer (automatic) | PaymentMethod_Credit card (automatic) | PaymentMethod_Electronic check | PaymentMethod_Mailed check | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 29.85 | 29.85 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
| 1 | 0 | 56.95 | 1889.50 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | ... | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 |
| 2 | 0 | 53.85 | 108.15 | 1 | 1 | 0 | 1 | 0 | 0 | 1 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 |
| 3 | 0 | 42.30 | 1840.75 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | ... | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 |
| 4 | 0 | 70.70 | 151.65 | 1 | 1 | 0 | 1 | 0 | 0 | 1 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7038 | 0 | 84.80 | 1990.50 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | ... | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 |
| 7039 | 0 | 103.20 | 7362.90 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | ... | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 |
| 7040 | 0 | 29.60 | 346.45 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
| 7041 | 1 | 74.40 | 306.60 | 1 | 0 | 1 | 1 | 0 | 0 | 1 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 |
| 7042 | 0 | 105.65 | 6844.50 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | ... | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 |
7032 rows × 43 columns
churn_corr_matrix = telco_data_dummies.corr()
churn_corr_matrix['Churn'].sort_values(ascending = False).plot(kind='bar',figsize = (15,10))
<AxesSubplot:>
HIGH Churn seen in case of monthly contracts, no online security, no technical support, first year subscription and fiber optic Internet;
LOW Churn is seen in the case of long-term contracts, subscriptions without internet service and customers contracted for more than 5 years;
Factors such as gender, phone service availability, and number of multiple lines have almost NO impact on churn!
churn_corr_matrix['Churn'].sort_values(ascending = False)
Churn 1.000000 Contract_Month-to-month 0.404565 OnlineSecurity_No 0.342235 TechSupport_No 0.336877 InternetService_Fiber optic 0.307463 PaymentMethod_Electronic check 0.301455 OnlineBackup_No 0.267595 DeviceProtection_No 0.252056 MonthlyCharges 0.192858 PaperlessBilling_Yes 0.191454 Dependents_No 0.163128 SeniorCitizen 0.150541 Partner_No 0.149982 StreamingMovies_No 0.130920 StreamingTV_No 0.128435 StreamingTV_Yes 0.063254 StreamingMovies_Yes 0.060860 MultipleLines_Yes 0.040033 PhoneService_Yes 0.011691 PhoneService_No -0.011691 MultipleLines_No phone service -0.011691 MultipleLines_No -0.032654 DeviceProtection_Yes -0.066193 OnlineBackup_Yes -0.082307 PaymentMethod_Mailed check -0.090773 PaymentMethod_Bank transfer (automatic) -0.118136 InternetService_DSL -0.124141 PaymentMethod_Credit card (automatic) -0.134687 Partner_Yes -0.149982 Dependents_Yes -0.163128 TechSupport_Yes -0.164716 OnlineSecurity_Yes -0.171270 Contract_One year -0.178225 PaperlessBilling_No -0.191454 TotalCharges -0.199484 StreamingTV_No internet service -0.227578 OnlineSecurity_No internet service -0.227578 InternetService_No -0.227578 StreamingMovies_No internet service -0.227578 OnlineBackup_No internet service -0.227578 TechSupport_No internet service -0.227578 DeviceProtection_No internet service -0.227578 Contract_Two year -0.301552 Name: Churn, dtype: float64
x = telco_data_dummies.drop('Churn',axis = 1)
x
| SeniorCitizen | MonthlyCharges | TotalCharges | Partner_No | Partner_Yes | Dependents_No | Dependents_Yes | PhoneService_No | PhoneService_Yes | MultipleLines_No | ... | StreamingMovies_Yes | Contract_Month-to-month | Contract_One year | Contract_Two year | PaperlessBilling_No | PaperlessBilling_Yes | PaymentMethod_Bank transfer (automatic) | PaymentMethod_Credit card (automatic) | PaymentMethod_Electronic check | PaymentMethod_Mailed check | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 29.85 | 29.85 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
| 1 | 0 | 56.95 | 1889.50 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | ... | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 |
| 2 | 0 | 53.85 | 108.15 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 |
| 3 | 0 | 42.30 | 1840.75 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 |
| 4 | 0 | 70.70 | 151.65 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7038 | 0 | 84.80 | 1990.50 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | ... | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 |
| 7039 | 0 | 103.20 | 7362.90 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | ... | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 |
| 7040 | 0 | 29.60 | 346.45 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
| 7041 | 1 | 74.40 | 306.60 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 |
| 7042 | 0 | 105.65 | 6844.50 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | ... | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 |
7032 rows × 42 columns
y = telco_data_dummies['Churn']
y
0 0
1 0
2 1
3 0
4 1
..
7038 0
7039 0
7040 0
7041 1
7042 0
Name: Churn, Length: 7032, dtype: int64
x.shape
(7032, 42)
y.shape
(7032,)
y.value_counts()
0 5163 1 1869 Name: Churn, dtype: int64
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=0)
x_resampled_smote, y_resampled_smote = smote.fit_resample(x,y)
y_resampled_smote.value_counts()
0 5163 1 5163 Name: Churn, dtype: int64
x_resampled_smote
| SeniorCitizen | MonthlyCharges | TotalCharges | Partner_No | Partner_Yes | Dependents_No | Dependents_Yes | PhoneService_No | PhoneService_Yes | MultipleLines_No | ... | StreamingMovies_Yes | Contract_Month-to-month | Contract_One year | Contract_Two year | PaperlessBilling_No | PaperlessBilling_Yes | PaymentMethod_Bank transfer (automatic) | PaymentMethod_Credit card (automatic) | PaymentMethod_Electronic check | PaymentMethod_Mailed check | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 29.850000 | 29.850000 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
| 1 | 0 | 56.950000 | 1889.500000 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | ... | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 |
| 2 | 0 | 53.850000 | 108.150000 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 |
| 3 | 0 | 42.300000 | 1840.750000 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 |
| 4 | 0 | 70.700000 | 151.650000 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10321 | 0 | 103.976753 | 242.804921 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | ... | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 10322 | 0 | 35.824447 | 35.824447 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | ... | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 10323 | 0 | 44.493077 | 1061.960339 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 10324 | 0 | 19.363055 | 19.363055 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 |
| 10325 | 0 | 96.922890 | 96.922890 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | ... | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
10326 rows × 42 columns
y_resampled_smote.notnull().sum()
10326
x_resampled_smote.notnull().sum()
SeniorCitizen 10326 MonthlyCharges 10326 TotalCharges 10326 Partner_No 10326 Partner_Yes 10326 Dependents_No 10326 Dependents_Yes 10326 PhoneService_No 10326 PhoneService_Yes 10326 MultipleLines_No 10326 MultipleLines_No phone service 10326 MultipleLines_Yes 10326 InternetService_DSL 10326 InternetService_Fiber optic 10326 InternetService_No 10326 OnlineSecurity_No 10326 OnlineSecurity_No internet service 10326 OnlineSecurity_Yes 10326 OnlineBackup_No 10326 OnlineBackup_No internet service 10326 OnlineBackup_Yes 10326 DeviceProtection_No 10326 DeviceProtection_No internet service 10326 DeviceProtection_Yes 10326 TechSupport_No 10326 TechSupport_No internet service 10326 TechSupport_Yes 10326 StreamingTV_No 10326 StreamingTV_No internet service 10326 StreamingTV_Yes 10326 StreamingMovies_No 10326 StreamingMovies_No internet service 10326 StreamingMovies_Yes 10326 Contract_Month-to-month 10326 Contract_One year 10326 Contract_Two year 10326 PaperlessBilling_No 10326 PaperlessBilling_Yes 10326 PaymentMethod_Bank transfer (automatic) 10326 PaymentMethod_Credit card (automatic) 10326 PaymentMethod_Electronic check 10326 PaymentMethod_Mailed check 10326 dtype: int64
from sklearn.linear_model import LogisticRegression
#checking on imbalance data
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)
LogReg = LogisticRegression(solver='lbfgs', max_iter=400)
LogReg.fit(x_train,y_train)
LogisticRegression(max_iter=400)
y_pred = LogReg.predict(x_test)
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)
0.7867803837953091
#checking on balanced data
x_smote_train,x_smote_test,y_smote_train,y_smote_test = train_test_split(x_resampled_smote,y_resampled_smote,test_size=0.2,random_state=42)
LogReg.fit(x_smote_train,y_smote_train)
C:\Users\Nupur\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:814: ConvergenceWarning:
lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
LogisticRegression(max_iter=400)
y_smote_pred = LogReg.predict(x_smote_test)
accuracy_score(y_smote_test,y_smote_pred)
0.8325266214908035